We study the CRAN distribution of R packages hosted on GitHub. In particular, we carry out a survival analysis based on the time needed for an R package on GitHub to appear on CRAN.
In [1]:
import pandas
import numpy
from matplotlib import pyplot as plt
%matplotlib inline
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf')
cran_release = pandas.read_csv('../data/cran-packages-150601.csv')
data = pandas.read_csv('../data/github-cran-bioc-alldata-150420.csv')
In [2]:
# Keep, for each package, its earliest appearance on GitHub
github = data.query('Source == "github"').sort_values('Date').drop_duplicates('Package').set_index('Package')
github.rename(columns={'Date': 'github'}, inplace=True)
In [3]:
# Keep, for each package, its earliest CRAN release
cran = (cran_release.sort_values('mtime')
        .drop_duplicates('package')
        .rename(columns={'package': 'Package', 'mtime': 'cran'})
        [['Package', 'cran']]
        .set_index('Package'))
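As a quick sanity check of the sort-then-deduplicate pattern used above (a toy frame with made-up values): sorting ascending and dropping duplicates keeps each package's earliest row, because drop_duplicates retains the first occurrence by default.
toy = pandas.DataFrame({'package': ['a', 'a', 'b'],
                        'mtime': ['2014-01-01', '2015-06-01', '2015-01-01']})
print(toy.sort_values('mtime').drop_duplicates('package'))
# 'a' keeps its 2014-01-01 row; 'b' keeps its only row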
In [4]:
# Keep GitHub packages; packages never released on CRAN get NaT in 'cran'
packages = github.join(cran, how='left')
# Fix datetimes
packages['github'] = pandas.to_datetime(packages['github'])
packages['cran'] = pandas.to_datetime(packages['cran'])
# Compute delta & sort (cosmetic)
packages['elapsed'] = packages['cran'] - packages['github']
packages.sort_values('elapsed', inplace=True)
# Convert the delta to days
packages['elapsed'] = packages['elapsed'] / numpy.timedelta64(1, 'D')
# Do GitHub packages have a dependency that is on CRAN?
packages.fillna({'Depends': '', 'Imports': ''}, inplace=True)
def cran_deps(r):
    # Depends/Imports are whitespace-separated package names in this dataset
    return any(p in cran.index
               for p in r['Depends'].split() + r['Imports'].split())
packages['cran_deps'] = packages.apply(cran_deps, axis=1)
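To make the dependency check concrete, here is how cran_deps behaves on a hypothetical row (the row below is made up; the result depends on what is in this CRAN snapshot):
example = pandas.Series({'Depends': 'ggplot2 methods', 'Imports': 'someInternalPkg'})
print(cran_deps(example))
# True as long as at least one listed name (e.g. 'ggplot2') is in cran.index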
In [5]:
ax = packages['elapsed'].plot(kind='hist', bins=50)
ax.set_title('Time needed for a GitHub package to appear on CRAN')
ax.figure.set_size_inches(12, 4)
ax.set_xlabel('Duration in days')
ax.set_ylabel('Number of GitHub R packages')
Out[5]: [histogram: time needed for a GitHub package to appear on CRAN, duration in days vs. number of GitHub R packages]
What are those outliers?
In [6]:
packages.sort_values('elapsed', ascending=False).head(5)[['github', 'cran', 'elapsed']]
Out[6]: [table: the five packages with the largest 'elapsed' values]
Let's prepare the data for a survival analysis.
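As a refresher on right-censoring (toy durations, made-up values): packages that reached CRAN are observed events, while packages still absent from CRAN only give a lower bound on their duration, which the Kaplan-Meier estimator accounts for. A minimal sketch:
import lifelines
toy_kmf = lifelines.KaplanMeierFitter()
# Three packages reached CRAN after 30, 90 and 365 days; two had not yet
# reached CRAN after 400 and 700 days (right-censored)
toy_kmf.fit([30, 90, 365, 400, 700], event_observed=[True, True, True, False, False])
print(toy_kmf.survival_function_)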
In [7]:
import lifelines
In [8]:
survival = packages.copy()
# Remove packages that were first on CRAN: keep those whose CRAN date is
# missing (not yet released) or strictly later than their GitHub date
survival = survival[survival['cran'].isnull() | (survival['github'] < survival['cran'])].copy()
# Observed packages are those that did reach CRAN
survival['observed'] = survival['elapsed'].notnull()
# Censored packages (NaN elapsed) get the duration from GitHub until now
now = pandas.Timestamp.now()
survival['elapsed'] = survival['elapsed'].fillna((now - survival['github']) / numpy.timedelta64(1, 'D'))
print(len(packages), len(survival), len(survival[survival['observed']]))
In [9]:
kmf = lifelines.KaplanMeierFitter()
plot_groups = [
    [
        {'label': 'All', 'df': survival},
        {'label': 'Without outliers', 'df': survival.query('observed == False or elapsed < 1200')},
    ], [
        # Crude split on the first character of the version string
        {'label': 'Version >= 1', 'df': survival[survival['Version'].str[0] >= '1']},
        {'label': 'Version < 1', 'df': survival[survival['Version'].str[0] < '1']},
    ], [
        {'label': 'With CRAN dependencies', 'df': survival.query('cran_deps == True')},
        {'label': 'Without CRAN dependencies', 'df': survival.query('cran_deps == False')},
    ],
]
for i, group in enumerate(plot_groups):
    ax = plt.subplot(len(plot_groups), 1, i + 1)
    for cond in group:
        print('{}: {} items, {} observed'.format(
            cond['label'], len(cond['df']), len(cond['df'].query('observed == True'))))
        kmf.fit(cond['df']['elapsed'], event_observed=cond['df']['observed'], label=cond['label'])
        ax = kmf.plot(ax=ax)
    ax.set_title('Kaplan-Meier estimation of the time needed for a GitHub package to appear on CRAN')
ax.figure.set_size_inches(12, 8)
ax.set_xlabel('Duration (in days)')
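As a possible follow-up (assuming a recent lifelines release, where the fitter exposes a median_survival_time_ attribute), one could read off the estimated median time to CRAN:
# Duration by which half of the estimated population has reached CRAN;
# infinite if the survival curve never drops below 0.5
kmf.fit(survival['elapsed'], event_observed=survival['observed'], label='All')
print(kmf.median_survival_time_)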